/** 
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *     Author:eliteqing@foxmail.com
 * 
 */
package edu.uci.ics.crawler4j.tests.ziroom;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.csvreader.CsvWriter;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Crawler that scrapes Ziroom (sh.ziroom.com) rental listings: for every visited
 * page it records the outgoing links and the listing details (image, price,
 * address, description) into CSV files under {@code data/crawl/}.
 *
 * @since JDK 1.8
 */
public class ZiroomCrawler extends WebCrawler {

	/** 爬取匹配原则 */
	private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|ico"
			+ "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");
	/** 爬取数据保存文件路径 */
	private final static String DATA_PATH = "data/crawl/ziroom.csv";
	/** 爬取link文件路径 */
	private final static String LINK_PATH = "data/crawl/link.csv";
	// private static final Logger logger =
	// LoggerFactory.getLogger(ZiroomCrawler.class);

	private final static String URL_PREFIX = "http://sh.ziroom.com/z/nl/";

	private final File fLinks;
	private final File fDatas;

	private CsvWriter csvLinks;
	private CsvWriter csvDatas;

	/**
	 * You should implement this function to specify whether the given url
	 * should be crawled or not (based on your crawling logic).
	 */
	ZiroomCrawlStat myCrawlStat;

	public ZiroomCrawler() throws IOException {
		myCrawlStat = new ZiroomCrawlStat();
		fLinks = new File(DATA_PATH);
		fDatas = new File(LINK_PATH);
		if (fLinks.isFile()) {
			fLinks.delete();
		}
		if (fDatas.isFile()) {
			fDatas.delete();
		}
		csvDatas = new CsvWriter(new FileWriter(fDatas, true), ',');
		csvDatas.write("请求路径");
		csvDatas.endRecord();
		csvDatas.close();
		csvLinks = new CsvWriter(new FileWriter(fLinks, true), ',');
		csvLinks.write("图片");
		csvLinks.write("价格");
		csvLinks.write("地址");
		csvLinks.write("说明");
		csvLinks.endRecord();
		csvLinks.close();
	}

	public void dumpMyData() {
		final int id = getMyId();
		// You can configure the log to output to file
		logger.info("Crawler {} > Processed Pages: {}", id, myCrawlStat.getTotalProcessedPages());
		logger.info("Crawler {} > Total Links Found: {}", id, myCrawlStat.getTotalLinks());
		logger.info("Crawler {} > Total Text Size: {}", id, myCrawlStat.getTotalTextSize());
	}

	@Override
	public Object getMyLocalData() {
		return myCrawlStat;
	}

	@Override
	public void onBeforeExit() {
		dumpMyData();
	}

	/*
	 * 这个方法决定了要抓取的URL及其内容，例子中只允许抓取“http://sh.ziroom.com/z/nl/”这个域的页面,
	 * 不允许.css、.js和多媒体等文件
	 *
	 * @see edu.uci.ics.crawler4j.crawler.WebCrawler#shouldVisit(edu.uci.ics.
	 * crawler4j.crawler.Page, edu.uci.ics.crawler4j.url.WebURL)
	 */
	@Override
	public boolean shouldVisit(Page referringPage, WebURL url) {
		final String href = url.getURL().toLowerCase();

		if (FILTERS.matcher(href).matches() || !href.startsWith(URL_PREFIX)) {
			return false;
		}
		return true;
	}

	/*
	 * 当URL下载完成会调用这个方法。你可以轻松获取下载页面的url, 文本, 链接, html,和唯一id等内容。
	 *
	 * @see
	 * edu.uci.ics.crawler4j.crawler.WebCrawler#visit(edu.uci.ics.crawler4j.
	 * crawler.Page)
	 */
	@Override
	public void visit(Page page) {
		final String url = page.getWebURL().getURL();
		logger.info("爬取路径：" + url);
		myCrawlStat.incProcessedPages();
		if (page.getParseData() instanceof HtmlParseData) {
			final HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
			final Set<WebURL> links = htmlParseData.getOutgoingUrls();
			try {
				linkToCsv(links);
			} catch (final IOException e2) {
				// TODO Auto-generated catch block
				e2.printStackTrace();
			}
			myCrawlStat.incTotalLinks(links.size());
			try {
				myCrawlStat.incTotalTextSize(htmlParseData.getText().getBytes("UTF-8").length);
			} catch (final UnsupportedEncodingException e1) {
				// TODO Auto-generated catch block
				e1.printStackTrace();
			}
			final String html = htmlParseData.getHtml();

			final Document doc = Jsoup.parse(html);

			final Elements contents = doc.select("li[class=clearfix]");

			for (final Element c : contents) {
				// 图片
				final String img = c.select(".img img").first().attr("src");
				logger.debug("图片：" + img);

				// 地址
				final Element txt = c.select("div[class=txt]").first();
				final String arr1 = txt.select("h3 a").first().text();
				final String arr2 = txt.select("h4 a").first().text();
				final String arr3 = txt.select("div[class=detail]").first().text();

				final String arr = arr1.concat(arr1 + ",").concat(arr2 + ",").concat(arr3);
				logger.debug("地址：" + arr);
				// 说明
				final String rank = txt.select("p").first().text();
				logger.debug("说明：" + rank);

				// 价格
				final String pirce = c.select("p[class=price]").first().text();

				try {
					csvLinks = new CsvWriter(new FileWriter(fLinks, true), ',');
					csvLinks.write(img);
					csvLinks.write(pirce);
					csvLinks.write(arr);
					csvLinks.write(rank);
					csvLinks.endRecord();
					csvLinks.flush();
					csvLinks.close();
				} catch (final IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	private void linkToCsv(Set<WebURL> links) throws IOException {
		csvDatas = new CsvWriter(new FileWriter(fDatas, true), ',');
		for (final WebURL webURL : links) {
			csvDatas.write(webURL.getURL());
		}
		csvDatas.flush();
		csvDatas.endRecord();
		csvDatas.close();
	}
}
